Frequent Words Problem


In [ ]:
%matplotlib inline
import os
import sys
#import pysam
import pandas as pd
#import pybedtools as pybed
import matplotlib as mpl
import matplotlib.pyplot as plt
from __future__ import division

# Libraries just for this problem
import operator
import itertools

Define Functions


In [11]:
def occurrences(string, sub):
    """ Counts overlapping string occurrences """
    count = start = 0
    while True:
        start = string.find(sub, start) + 1
        if start > 0:
            count += 1
        else:
            return count

In [ ]:
def max_dict_by_value(dictionary):
    """Returns top dictionary key and value"""
    return dict(max(dictionary.iteritems(), key=operator.itemgetter(1)))

In [38]:
def most_frequent_kmer1(seq, kmer_len, num_kmers):
    """ Returns most frequent kmer of length kmer_len
        within a sequence of interest"""
    kmers = dict()
    for i in range(len(seq) - kmer_len + 1):
        kmer = string[i:i+kmer_len]
        if kmer in kmers:
            kmers[kmer] += 1
        else:
            kmers[kmer] = 1
    for i in range(num_kmers):
        maximum = max_dict_by_value(kmers)
        key = maximum.keys()
        kmers.pop(maximum.key(), None)

In [38]:
def most_frequent_kmer2(seq, kmer_len):
    """ Returns most frequent kmer of length kmer_len
        within a sequence of interest"""
    kmers = dict()
    perms = [''.join(p) for p in itertools.product("ACTG", repeat=kmer_len)]
    for perm in perms:
        kmers[perm] = occurrences(string, perm)

Run Tests


In [6]:
string = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
kmer_len = 4
kmers = dict()

In [9]:
perms = [''.join(p) for p in permutations('ACTG')]
dictionary = dict()
for perm in perms:
    dictionary[perm] = occurrences(string, perm)

In [16]:
sorted_kmers = sorted(kmers.iteritems(), key=operator.itemgetter(1))


Out[16]:
1

Call Function


In [40]:
most_frequent_kmer1(string, kmer_len)


Out[40]:
'CATG'

In [33]:
max(kmers, key = lambda x: kmers.get(x))


Out[33]:
'CATG'

In [34]:
maximum = max(kmers.values())
keys = [x for x,y in kmers.items() if y == maximum]

In [36]:
" ".join(keys)


Out[36]:
'CATG GCAT'

In [ ]: